# coding=utf-8

#verfication code recognize
#https://www.cnblogs.com/TTyb/p/6156395.html?from=timeline&isappinstalled=0
import pandas as pd
import random
import re
import time
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import codecs
import signal
import linecache
import urllib
from PIL import Image
import math
import sys, os



#def
# 夹角公式
class VectorCompare:
    # 计算矢量大小
    # 计算平方和
    def magnitude(self, concordance):
        total = 0
        for word, count in concordance.items():
            total += count ** 2
        return math.sqrt(total)

    # 计算矢量之间的 cos 值
    def relation(self, concordance1, concordance2):
        topvalue = 0
        for word, count in concordance1.items():
            if word in concordance2:
                # 计算相乘的和
                topvalue += count * concordance2[word]
        return topvalue / (self.magnitude(concordance1) * self.magnitude(concordance2))

def buildvector(im):
    d1 = {}
    count = 0
    for i in im.getdata():
        d1[count] = i
        count += 1
    return d1




def recognize_verfication_code(item):
    try:
        newjpgname = []
        im = Image.open(item)
        # jpg不是最低像素，gif才是，所以要转换像素
        im = im.convert("P")

        # 像素直方图
        his = im.histogram()

        values = {}
        for i in range(0, 256):
            values[i] = his[i]

        # 排序，x:x[1]是按照括号内第二个字段进行排序,x:x[0]是按照第一个字段
        # temp = sorted(values.items(), key=lambda x: x[1], reverse=True)

        # 占比最多的10种颜色
        # for j, k in temp[:10]:
        #     print(j, k)
        # 255 12177
        # 0 772
        # 254 94
        # 1 40
        # 245 10
        # 12 9
        # 236 9
        # 243 9
        # 2 8
        # 6 8
        # 255是白底，0是黑色，可以打印来看看0和254

        # 获取图片大小，生成一张白底255的图片________________________黑白处理
        im2 = Image.new("P", im.size, 255)
        for y in range(im.size[1]):
            # 获得y坐标
            for x in range(im.size[0]):

                # 获得坐标(x,y)的RGB值
                pix = im.getpixel((x, y))

                # 这些是要得到的数字
                # 事实证明只要0就行，254是斑点
                if pix == 0:
                    # 将黑色0填充到im2中
                    im2.putpixel((x, y), 0)
        # 生成了一张黑白二值照片---------------------------------------------------------
        # im2.show()

        # 纵向切割---------------------------------------------------------------
        # 找到切割的起始和结束的横坐标
        inletter = False
        foundletter = False
        start = 0
        end = 0

        letters = []

        for x in range(im2.size[0]):
            for y in range(im2.size[1]):
                pix = im2.getpixel((x, y))
                if pix != 255:
                    inletter = True
            if foundletter == False and inletter == True:
                foundletter = True
                start = x

            if foundletter == True and inletter == False:
                foundletter = False
                end = x
                letters.append((start, end))

            inletter = False
        # print(letters)
        # [(27, 47), (48, 71), (73, 101), (102, 120), (122, 147), (148, 166)]
        # 打印出6个点，说明能切割成6个字母，正确

        for letter in letters:
            # (切割的起始横坐标，起始纵坐标，切割的宽度，切割的高度)
            im3 = im2.crop((letter[0], 0, letter[1], im2.size[1]))



            # ____________________________________________________________________________________

        # 加载训练集-----------------------用相似度找到最合适的字母
        v = VectorCompare()

        # 开始破解训练
        count = 0
        for letter in letters:
            # (切割的起始横坐标，起始纵坐标，切割的宽度，切割的高度)
            im3 = im2.crop((letter[0], 0, letter[1], im2.size[1]))

            guess = []
            # 将切割得到的验证码小片段与每个训练片段进行比较
            for image in imageset:
                for x, y in image.items():
                    if len(y) != 0:
                        guess.append((v.relation(y[0], buildvector(im3)), x))

            # 排序选出夹角最小的（即cos值最大）的向量，夹角越小则越接近重合，匹配越接近
            guess.sort(reverse=True)
            newjpgname.append(guess[0][1])
            count += 1

        # 得到拼接后的验证码识别图像
        newname = str("".join(newjpgname))  # --------------------------------------

        # 向框输入newname    ???????????????
        print (newname)
        return (newname)
    except Exception as err:
        print('identifying code error --------------')
        print(err)
        # 如果错误就记录下来



        # 开启多进程
        # def runthreading():
        #     pool = ThreadPoolExecutor(5)
        # jpgname = listfiles(path, "jpg")         #传入图片，生成路径列表
        # for item in jpgname:
        #     # 识别过的就不再识别了
        #     if len(item)>30:
        #         pool.submit(main, item)



# yanzheng 全局变量

# iconset = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
#            'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
#que   d,i,o,q,s,v,w,z
iconset = ['a','b','c','e','f','g','h','j','k','l','m','n','p','r','t','u','x','y']


imageset = []
for letter in iconset:
    for img in os.listdir('iconset1/%s/' % (letter)):
        temp = []
        if img != "":
            temp.append(buildvector(Image.open("iconset1/%s/%s" % (letter, img))))
        imageset.append({letter: temp})










display = Display(visible=0, size=(800, 800))
display.start()
driver = webdriver.Chrome()
#driver = webdriver.Firefox()
global saveFlag
saveFlag = True

li_url = []     #新加

# 计时用的函数
def timed_out(b, c):
    print('alarmed')
    raise RuntimeError()

def child_exit(b,c):
    ret = os.wait()
    print('child exiting',ret)


def read_config():
    num1 = 0
    num2 = 0
    if os.path.exists('save_goods.csv'):
        load2 = pd.read_csv('save_goods.csv', header=None, encoding='utf-8', sep=',')
        num1 = load2.iat[0, 0]
        num2 = load2.iat[0, 1]
    return num1,num2

def write_config(number,times):
    save = codecs.open('save_goods.csv', "w+", encoding='utf-8')
    save.write(str(number) + ',' + str(times))
    save.close()

# 获取商品页面内容，返回页面html
def gethtml2(goodsUrl):
    global driver
    global display
    try_times = 0
    content= ''
    while try_times < 100:

        # 配合计时函数，开一个信号来计算时间，超时便报错，然后except重新再爬这个网址。要是200次都报错就跳过该网址
        # （有时候因为网络或者模拟浏览器的问题，程序会卡住但不会报错跳出，用这个方法识别超时，手动报出错，catch错误后重新爬）
        #signal.signal(signal.SIGALRM, timed_out)
        #signal.setitimer(signal.ITIMER_REAL, 10, 0)

        try:

            # 模拟chrome，并消除浏览器图像化，以便在服务器里运行

            try_times = try_times + 1

            # 正则去除连接上的反爬虫部分
            re_url = re.compile('/\d{3}-\d{7}-\d{7}')
            url = re_url.sub('', goodsUrl).replace('amp;', '').replace('&th=1', '')
            driver.get(url)

            print(driver.current_url)

            content = driver.page_source.encode('utf-8', 'ignore').decode()
            if content.find('您输入的网址在我们的网站上无法正常显示网页')!= -1:
                print('good is missing')
                content=""
            elif len(content) < 1024 * 5:
                print('block 2...', len(content))
                raise RuntimeError()
            else:
			    #print('block 1...')
                #f= open('/tmp/aa','w')
                #f.write(content)
                #f.close()
                #driver.get_screenshot_as_file("/tmp/aa.png")
                #raise RuntimeError()

                while (content.find('抱歉，我们只是想确认一下当前访问者并非自动程序')) != -1:
                    print('block 1...')
                    # f = open('/tmp/fp.txt', 'w')
                    # f.write(content)
                    # f.close()

                    # 找正则表达式
                    pi = r'<img src="(.*?)" />'
                    pi_url = re.findall(pi, str(content), re.S | re.M)  # findall下是一个列表
                    # 获取图片
                    if len(pi_url) > 0:
                        data = urllib.request.urlopen(pi_url[0]).read()
                        address = 'pi/pi.jpg'
                        w = open(address, 'wb')
                        w.write(data)
                        w.close()

                        de = recognize_verfication_code(address)
                    else:
                        print('pi_url is null')
                    # 输入验证码
                    driver.find_element_by_id("captchacharacters").send_keys(str(de))
                    # driver.find_element_by_name("继续购物").click()
                    driver.find_element_by_class_name('a-button-text').click()
                    time.sleep(0.5)

                    re_url = re.compile('/\d{3}-\d{7}-\d{7}')
                    url = re_url.sub('', driver.current_url).replace('amp;', '').replace('&th=1', '')
                    driver.get(url)

                    # 打印当前网址
                    print(driver.current_url)
                    content = driver.page_source.encode('utf-8', 'ignore').decode()

                    # index = random.randint(0, 15)
                    # user_agent = user_agents[index]
                    # head = {'User-Agent': user_agent}
                    # req = request.Request(url_root, headers=head)
                    # response = urllib.request.urlopen(req)
                    # html = response.read().decode('utf-8')
            break




        except Exception as e:
            print(try_times,e)
            try:

                #driver.quit()
                #display.stop()
                print('kill all')
                os.system('pkill -9 chrome')
                os.system('pkill -9 Xvfb')
                os.system('pkill -9 chromedriver')
                #os.system('pkill -9 geckodriver')
                os.system('rm -rf /tmp/.com.google.Chrome*')
                os.system('rm -rf /tmp/.org.chromium*')
                print('sleep...')
                time.sleep(1)
                print('starting')
                #time.sleep(random.randint(1, 3))
                display = Display(visible=0, size=(800, 800))
                display.start()
                driver = webdriver.Chrome()
                driver.delete_all_cookies()
                print('started')
            except Exception as e:
                print('!',e)
    return content

# 压缩html代码，去除脚本和header
def compress(html):
    dr = re.compile(r'<script(.*?)</script>', re.S | re.M)
    Info = dr.sub('', str(html))
    dr3 = re.compile(r'<style(.*?)</style>', re.S | re.M)
    Info2 = dr3.sub('', Info)
    dr2 = re.compile(r'<noscript>(.*?)</noscript>', re.S | re.M)
    Info3 = dr2.sub('', Info2)
    categoryRanke = re.findall('<div class="nav-search-facade" data-value="search-alias=aps">(.*?)</div>', Info3, re.S | re.M)
    dr4 = re.compile(r'<header class=(.*?)</header>', re.S | re.M)
    Info4 = dr4.sub('', Info3)
    if len(categoryRanke) == 0:
        categoryRanke = [1]
        categoryRanke[0] = ''
    txt = (categoryRanke[0] + Info4).replace('\n','').replace('  ','')
    return txt

def check_file_is_normal(f):
    if os.path.exists(f) == False:
        return False
    if os.path.getsize(f) == 0:
        return False
    return True

def goods(number, num2,filename):
    # 注意路径
    list = linecache.getlines("./urls/"+filename)

# 需要的话，读取进度，读取一次后，saveFlag变成false，本次运行再也不会读取
    global saveFlag
    num = 0
    if saveFlag == True:
        num = num2
        saveFlag = False

    # 遍历页面下的每个商品
    for times in range(num, len(list)):
        print('category code: ', str(filename).replace('.txt',''))
        print('number:', times)


        path = 'contents/' + str(filename).replace('.txt','') + '/'

        if os.path.exists(path) == False:
            os.mkdir(path)

        if check_file_is_normal(path + str(times) + '.txt') == False:
            content = gethtml2(list[times])
            allType = re.findall('<a class="a-link-normal" href="(.*?)">查看所有商品描述</a>', content, re.S | re.M)

            # 有些商品页面信息不全，会有一个‘详细信息’的页面，这时会用详细信息的页面替代原来获取的商品页面
            if len(allType) != 0:
                content = gethtml2('https://www.amazon.cn' + allType[0])
            # 保存页面文件
            output = codecs.open(path + str(times) + '.txt', "w+", encoding='utf-8')
            output.write(str(compress(content)))
            output.close()

        #保存当前爬虫进度
        write_config(number,times)


        if times > 4000:
            break


if __name__ == '__main__':

    print('starting...')
    # signal.signal(signal.SIGCLD,signal.SIG_IGN)
    #signal.signal(signal.SIGCHLD, signal.SIG_IGN)
    #signal.signal(signal.SIGCHLD, child_exit)
    num = 0
    num2 = 0
    #num,num2 = read_config()

    # 读取路径下的文件(.txt)个数,并排序
    filenames = os.listdir('./urls')

    tmp = []
    for f in filenames:
        if f.endswith(".txt"):
            tmp.append(f)
    filenames = tmp

    filenames.sort()
    #filenames.sort(key=lambda x: int(x[:-4]))
    for i in range(num,len(filenames)):
        goods(num,num2,filenames[i])






